{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\Deng\\AppData\\Local\\Temp\\jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4:...Preprocess   Corpus ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 0.616 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 2/4:...Collect co-occurrency information ...\n",
      "Step 3/4:...Calculate   mutual information ...\n",
      "Step 4/4:...Save    candidate words ...\n",
      "Finish! used 48.69 s\n"
     ]
    }
   ],
   "source": [
    "import cntext as ct\n",
    "import os\n",
    "\n",
    "# Raw corpus as a plain-text file (per the original note, only Chinese text is supported for now).\n",
    "corpus_path = 'data/sopmi_corpus.txt'\n",
    "# Manually selected seed words.\n",
    "seed_path = 'data/sopmi_seed_words.txt'\n",
    "\n",
    "sopmier = ct.SoPmi(\n",
    "    cwd=os.getcwd(),\n",
    "    input_txt_file=corpus_path,\n",
    "    seedword_txt_file=seed_path,\n",
    ")\n",
    "\n",
    "sopmier.sopmi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4:...Preprocess   corpus ...\n",
      "Step 2/4:...Train  word2vec model\n",
      "            used   53 s\n",
      "Step 3/4:...Prepare similar candidates for each seed word in the word2vec model...\n",
      "Step 4/4 Finish! Used 69 s\n",
      "Step 3/4:...Prepare similar candidates for each seed word in the word2vec model...\n",
      "Step 4/4 Finish! Used 69 s\n",
      "Step 3/4:...Prepare similar candidates for each seed word in the word2vec model...\n",
      "Step 4/4 Finish! Used 69 s\n",
      "Step 3/4:...Prepare similar candidates for each seed word in the word2vec model...\n",
      "Step 4/4 Finish! Used 69 s\n",
      "Step 3/4:...Prepare similar candidates for each seed word in the word2vec model...\n",
      "Step 4/4 Finish! Used 69 s\n"
     ]
    }
   ],
   "source": [
    "import cntext as ct\n",
    "import os\n",
    "\n",
    "\n",
    "# Initialise the word2vec trainer (English and Chinese are supported).\n",
    "model = ct.W2VModels(cwd=os.getcwd(), lang='english')\n",
    "\n",
    "# Train on the corpus file; ngram=True is passed through to cntext unchanged.\n",
    "model.train(input_txt_file='data/w2v_corpus.txt', ngram=True)\n",
    "\n",
    "# For each concept's seed-word file, collect the 100 words most similar to its seed words.\n",
    "# A single loop replaces five copy-pasted calls, so adding a concept is a one-word change.\n",
    "seed_concepts = ['integrity', 'innovation', 'quality', 'respect', 'teamwork']\n",
    "for concept in seed_concepts:\n",
    "    model.find(seedword_txt_file=f'data/w2v_seeds/{concept}.txt', topn=100)\n",
    "\n",
    "# Save the word-vector model as a txt file.\n",
    "# The cells below load it from output/Word2Vec/w2v_model.txt.\n",
    "model.save(model_name='w2v_model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 需要注意\n",
    "\n",
    "训练出的w2v模型可以在后续分析中使用。\n",
    "\n",
    "```python\n",
    "from gensim.models import KeyedVectors\n",
    "#\n",
    "#w2v_model = KeyedVectors.load(w2v.model路径)\n",
    "w2v_model = KeyedVectors.load_word2vec_format('output/Word2Vec/w2v_model.txt')\n",
    "\n",
    "#找出最相近的词\n",
    "#w2v_model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])\n",
    "#w2v_model.most_similar_cosmul(positive=['女人', '国王'], negative=['男人'])\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import KeyedVectors\n",
    "\n",
    "w2v_model = KeyedVectors.load_word2vec_format('output/Word2Vec/w2v_model.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>.</th>\n",
       "      <th>bus</th>\n",
       "      <th>by</th>\n",
       "      <th>day</th>\n",
       "      <th>every</th>\n",
       "      <th>go</th>\n",
       "      <th>i</th>\n",
       "      <th>night</th>\n",
       "      <th>school</th>\n",
       "      <th>theatre</th>\n",
       "      <th>to</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>.</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bus</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>by</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>day</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>every</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>go</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>i</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>night</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>school</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>theatre</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>to</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         .  bus  by  day  every  go  i  night  school  theatre  to\n",
       ".        0    1   1    0      0   0  0      0       0        0   0\n",
       "bus      1    0   2    1      0   0  0      1       0        0   0\n",
       "by       1    2   0    1      2   0  0      1       0        0   0\n",
       "day      0    1   1    0      1   0  0      0       1        0   0\n",
       "every    0    0   2    1      0   0  0      1       1        1   2\n",
       "go       0    0   0    0      0   0  2      0       1        1   2\n",
       "i        0    0   0    0      0   2  0      0       0        0   2\n",
       "night    0    1   1    0      1   0  0      0       0        1   0\n",
       "school   0    0   0    1      1   1  0      0       0        0   1\n",
       "theatre  0    0   0    0      1   1  0      1       0        0   1\n",
       "to       0    0   0    0      2   2  2      0       1        1   0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import cntext as ct\n",
    "\n",
    "documents = [\"I go to school every day by bus .\",\n",
    "             \"i go to theatre every night by bus\"]\n",
    "\n",
    "ct.co_occurrence_matrix(documents, window_size=2, lang='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/sc/3mnt5tgs419_hk7s16gq61p80000gn/T/jieba.cache\n",
      "Loading model cost 0.565 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Python</th>\n",
       "      <th>好玩</th>\n",
       "      <th>学</th>\n",
       "      <th>很</th>\n",
       "      <th>是</th>\n",
       "      <th>最好</th>\n",
       "      <th>的</th>\n",
       "      <th>编程</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Python</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>好玩</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>学</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>很</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>是</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最好</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>的</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>编程</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Python  好玩  学  很  是  最好  的  编程\n",
       "Python       0   0  0  0  1   1  0   0\n",
       "好玩           0   0  0  1  0   0  0   1\n",
       "学            0   0  0  0  1   1  1   1\n",
       "很            0   1  0  0  0   0  0   1\n",
       "是            1   0  1  0  0   1  0   0\n",
       "最好           1   0  1  0  1   0  1   0\n",
       "的            0   0  1  0  0   1  0   1\n",
       "编程           0   1  1  1  0   0  1   0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents2 = [\"编程很好玩\",\n",
    "             \"Python是最好学的编程\"]\n",
    "\n",
    "ct.co_occurrence_matrix(documents2, window_size=2, lang='chinese')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Glove\n",
    "\n",
    "Train GloVe embeddings on ``data/brown_corpus.txt``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on class Glove in module cntext.dictionary:\n",
      "\n",
      "class Glove(builtins.object)\n",
      " |  Glove(cwd, lang='chinese')\n",
      " |  \n",
      " |  Methods defined here:\n",
      " |  \n",
      " |  __init__(self, cwd, lang='chinese')\n",
      " |      initialize the Glove model\n",
      " |      \n",
      " |      :param lang: set language for Glove model\n",
      " |      :return:\n",
      " |  \n",
      " |  cooccurrence_matrix(self)\n",
      " |      Create cooccurrence matrix\n",
      " |      \n",
      " |      :return:\n",
      " |  \n",
      " |  create_vocab(self, file, min_count=5)\n",
      " |      create vocabulary\n",
      " |      \n",
      " |      :param file:  corpus file path, only support .txt file now!\n",
      " |      :params min_count: When building the vocabulary ignore terms that wordcount strictly higher than the given threshold\n",
      " |      :return:\n",
      " |  \n",
      " |  save(self, model_name='glove_model')\n",
      " |      Save glove embeddings as a txt file. Note we will use gensim to convert the glove embeddings in word2vec format\n",
      " |      \n",
      " |      :params model_name: txt file name; save the glove model to txt file.\n",
      " |      :return:\n",
      " |  \n",
      " |  train_embeddings(self, vector_size=50, max_iter=25)\n",
      " |      Train glove embeddings\n",
      " |      \n",
      " |      :params vector_size:  Dimensionality of the word vectors. Default: 50\n",
      " |      :params max_iter: Number of training epochs. Default: 25\n",
      " |      :return:\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data descriptors defined here:\n",
      " |  \n",
      " |  __dict__\n",
      " |      dictionary for instance variables (if defined)\n",
      " |  \n",
      " |  __weakref__\n",
      " |      list of weak references to the object (if defined)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import cntext as ct \n",
    "\n",
    "help(ct.Glove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4: ...Create vocabulary for Glove.\n",
      "Step 2/4: ...Create cooccurrence matrix.\n",
      "Step 3/4: ...Train glove embeddings. \n",
      "             Note, this part takes a long time to run\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iteration 20: error 67155496.43684"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 3/4: ... Finish! Use 359.23 s\n",
      "Step 4/4: ... Save the glove embeddings to a txt file\n"
     ]
    }
   ],
   "source": [
    "import cntext as ct\n",
    "import os\n",
    "\n",
    "\n",
    "model = ct.Glove(cwd=os.getcwd(), lang='english')\n",
    "model.create_vocab(file='data/brown_corpus.txt', min_count=5)\n",
    "model.cooccurrence_matrix()\n",
    "model.train_embeddings(vector_size=50, max_iter=25)\n",
    "model.save(model_name='brown_glove')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4: ...Create vocabulary for Glove.\n",
      "Step 2/4: ...Create cooccurrence matrix.\n",
      "Step 3/4: ...Train glove embeddings. \n",
      "             Note, this part takes a long time to run\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iteration 20: error 19828751.09112"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 3/4: ... Finish! Use 103.78 s\n",
      "Step 4/4: ... Save the glove embeddings to a txt file\n"
     ]
    }
   ],
   "source": [
    "import cntext as ct\n",
    "import os\n",
    "\n",
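    "# Set the language and the project folder path.\n",
    "# NOTE(review): santi.txt looks like a Chinese corpus (a later cell queries '宇宙') -- confirm lang='english' is intended.\n",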
    "model = ct.Glove(cwd=os.getcwd(), lang='english')\n",
    "\n",
    "# Load the corpus\n",
    "model.create_vocab(file='data/santi.txt', min_count=5)\n",
    "\n",
    "# Build the word co-occurrence matrix\n",
    "model.cooccurrence_matrix()\n",
    "\n",
    "# Set the embedding vector dimensionality and the number of training iterations\n",
    "model.train_embeddings(vector_size=50, max_iter=25)\n",
    "\n",
    "# Save the model\n",
    "model.save(model_name='santi_glove_model')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入刚刚训练好的glove模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on method load_word2vec_format in module gensim.models.keyedvectors:\n",
      "\n",
      "load_word2vec_format(fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=<class 'numpy.float32'>, no_header=False) method of builtins.type instance\n",
      "    Load KeyedVectors from a file produced by the original C word2vec-tool format.\n",
      "    \n",
      "    Warnings\n",
      "    --------\n",
      "    The information stored in the file is incomplete (the binary tree is missing),\n",
      "    so while you can query for word similarity etc., you cannot continue training\n",
      "    with a model loaded this way.\n",
      "    \n",
      "    Parameters\n",
      "    ----------\n",
      "    fname : str\n",
      "        The file path to the saved word2vec-format file.\n",
      "    fvocab : str, optional\n",
      "        File path to the vocabulary.Word counts are read from `fvocab` filename, if set\n",
      "        (this is the file generated by `-save-vocab` flag of the original C tool).\n",
      "    binary : bool, optional\n",
      "        If True, indicates whether the data is in binary word2vec format.\n",
      "    encoding : str, optional\n",
      "        If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.\n",
      "    unicode_errors : str, optional\n",
      "        default 'strict', is a string suitable to be passed as the `errors`\n",
      "        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source\n",
      "        file may include word tokens truncated in the middle of a multibyte unicode character\n",
      "        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.\n",
      "    limit : int, optional\n",
      "        Sets a maximum number of word-vectors to read from the file. The default,\n",
      "        None, means read all.\n",
      "    datatype : type, optional\n",
      "        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.\n",
      "        Such types may result in much slower bulk operations or incompatibility with optimized routines.)\n",
      "    no_header : bool, optional\n",
      "        Default False means a usual word2vec-format file, with a 1st line declaring the count of\n",
      "        following vectors & number of dimensions. If True, the file is assumed to lack a declaratory\n",
      "        (vocab_size, vector_size) header and instead start with the 1st vector, and an extra\n",
      "        reading-pass will be used to discover the number of vectors. Works only with `binary=False`.\n",
      "    \n",
      "    Returns\n",
      "    -------\n",
      "    :class:`~gensim.models.keyedvectors.KeyedVectors`\n",
      "        Loaded model.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(KeyedVectors.load_word2vec_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<gensim.models.keyedvectors.KeyedVectors at 0x13bd7f45af0>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from gensim.models import KeyedVectors\n",
    "\n",
    "model = KeyedVectors.load_word2vec_format('output/Glove/santi_glove_model.txt', no_header=True)\n",
    "#model = KeyedVectors.load_word2vec_format('output/Glove/三体_glove_model.txt', no_header=True)\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.6618259 ,  0.60663235,  0.9849417 , -1.028956  ,  1.0711069 ,\n",
       "       -0.8875306 , -0.52833366, -1.0125595 , -0.9628481 ,  1.0356479 ,\n",
       "        0.8595257 ,  0.7454354 , -1.0468111 , -0.26285014, -1.0310447 ,\n",
       "        0.9906805 ,  0.05825566, -0.85581344, -0.9932533 , -1.020438  ,\n",
       "        1.0495061 , -0.6973389 ,  0.49099424, -0.80775315,  0.64256483,\n",
       "        1.0157642 ,  1.0135043 , -1.0131834 ,  0.17376372,  0.89585054,\n",
       "        0.30890268,  0.798895  ,  0.6653925 ,  0.908629  , -1.048273  ,\n",
       "       -0.35683677,  0.06306187, -1.0267074 , -1.0494691 ,  0.42172813,\n",
       "        0.24005401,  0.5934993 , -0.0696691 , -1.0360557 , -0.9797269 ,\n",
       "        1.0205714 , -0.376359  , -1.0501183 ,  1.0415571 , -0.9312968 ],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.get_vector('宇宙')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "336px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "vscode": {
   "interpreter": {
    "hash": "39efd68f93e60613a22adb0a24f8974eb44aba5452699b12e52bf7b8f47c9669"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
